### Conversational AI and Equity: Assessing GPT-3’s Communication with Diverse Social Groups on Contentious Topics
## Coder: Anqi Shao
## Auditing: Kaiping Chen
### Computer Set Up: Windows 10/11 with at least 16GB of RAM, with R version 4.3.1.
### Figure 3 and stance detection code, written in Python, can be found here: 
### https://colab.research.google.com/drive/1G9uHVi-n_SLBDemucCDcI_FkEIcGrv6G?usp=sharing
## Jan 2024

##### Packages #####
library(dplyr)
library(tidyr)
library(readxl)
library(stringr)
library(textclean)
library(psych)
library(stargazer) # version 5.2.3
library(jtools)
library(data.table) # version 1.14.8
library(ggplot2)
#library(here)
library(tidyverse)
library(magrittr)
library(stm) # version v1.3.6
library(plotrix)
library(tm)
#library(xlsx)
require(NLP)
library(ggpubr)
library(ggstance)
library(quantreg)
library(caret)
library(magick)
library(performance)
library(gridExtra)
library(grid)

##### Functions for wrangling, analysis and visualization#####
#These functions are designed for efficiency in handling this large, repetitive dataset.

# Print the mean and standard deviation of `x` in APA style, "M (SD)".
# Both statistics skip missing values and are rounded to two decimals.
# The formatted string is printed and, through print(), returned invisibly.
meansd <- function(x) {
  avg <- round(mean(x, na.rm = TRUE), 2)
  spread <- round(sd(x, na.rm = TRUE), 2)
  print(paste0(avg, " (", spread, ")"))
}

# Collect the key estimates of a fitted model into one data frame.
#
# model: a fitted model that supports coef(), confint() and vcov()
#        (the script passes lm objects).
#
# Returns a data frame with one row per model term and the columns
#   coef      - point estimate
#   se        - standard error (square root of the vcov diagonal)
#   conf_low  - first column of confint(model)
#   conf_high - second column of confint(model)
extract_results <- function(model) {
  ci <- confint(model)
  data.frame(
    coef = coef(model),
    se = sqrt(diag(vcov(model))),
    conf_low = ci[, 1],
    conf_high = ci[, 2]
  )
}

# Tag each row with a significance colour key for Figures 2 and 5.
#
# df: data frame with the columns conf_low and conf_high.
#
# Returns `df` with an added `color` column:
#   "dark blue" - the whole interval lies above zero (conf_low > 0)
#   "dark red"  - the whole interval lies below zero (conf_high < 0)
#   "gray"      - every other row, including rows with missing bounds
add_color <- function(df) {
  df$color <- "gray"
  # which() drops NA comparisons, so rows with missing bounds keep "gray".
  above_zero <- which(df$conf_low > 0)
  below_zero <- which(df$conf_high < 0)
  df$color[above_zero] <- "dark blue"
  df$color[below_zero] <- "dark red"
  df
}

# Draw one coefficient panel of Figure 2 (noy = without y-axis labels).
#
# df: data frame of regression results whose row names identify the model
#     term. It must contain the columns DV, coef, conf_low, conf_high and
#     color, where color holds the keys "dark blue", "dark red" or "gray".
# iv: regular expression matched against rownames(df) to select the rows of
#     one predictor.
#
# Returns a ggplot object: one horizontal bar per DV with its confidence
# interval as an error bar. The value axis is fixed to [-1.25, 1.25]; values
# outside these limits are dropped by the scale.
plot_subset_noy <- function(df, iv) {
  df_subset <- df[grepl(iv, rownames(df)), ]
  ggplot(df_subset, aes(x = factor(DV), y = coef, fill = color)) +
    geom_col(position = position_dodge(), color = "white",
             alpha = 0.8, width = 0.5) +
    geom_errorbar(aes(ymin = conf_low, ymax = conf_high),
                  width = 0.05, color = "grey40", size = 0.8,
                  position = position_dodge(width = 0.5)) +
    coord_flip() +
    scale_y_continuous(limits = c(-1.25, 1.25),
                       breaks = c(-1, 1)) +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(size = 12),
          axis.text.y = element_blank(),
          plot.title = element_text(hjust = 0.5, size = 14)) +
    geom_hline(yintercept = 0, color = "black", size = 1) +
    # Map the colour keys to the fill colours actually drawn.
    scale_fill_manual(values = c("dark blue" = "steelblue",
                                 "dark red" = "firebrick",
                                 "gray" = "grey")) +
    labs(x = "", y = "Coefficient", fill = "") +
    # "none" replaces the deprecated guides(fill = FALSE) and matches the
    # other plotting helpers in this script.
    guides(fill = "none")
}

# Draw one coefficient panel of Figure 5 (noy = without y-axis labels).
#
# df: data frame of regression results whose row names identify the model
#     term. It must contain the columns DV, coef, conf_low, conf_high and
#     color, where color holds the keys "dark blue", "dark red" or "gray".
# iv: regular expression matched against rownames(df) to select the rows of
#     one predictor.
#
# Returns a ggplot object: one horizontal bar per DV with its confidence
# interval as an error bar. The value axis is fixed to [-0.3, 0.3]; values
# outside these limits are dropped by the scale.
plot_subset_noy_liwc <- function(df, iv) {
  df_subset <- df[grepl(iv, rownames(df)), ]
  ggplot(df_subset, aes(x = factor(DV), y = coef, fill = color)) +
    geom_col(position = position_dodge(), color = "white",
             alpha = 0.8, width = 0.5) +
    geom_errorbar(aes(ymin = conf_low, ymax = conf_high),
                  width = 0.05, color = "grey40", size = 0.8,
                  position = position_dodge(width = 0.5)) +
    coord_flip() +
    scale_y_continuous(limits = c(-0.3, 0.3),
                       breaks = c(-0.25, 0.25)) +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(size = 12),
          axis.text.y = element_blank(),
          plot.title = element_text(hjust = 0.5, size = 14)) +
    geom_hline(yintercept = 0, color = "black", size = 1) +
    # Map the colour keys to the fill colours actually drawn.
    scale_fill_manual(values = c("dark blue" = "steelblue",
                                 "dark red" = "firebrick",
                                 "gray" = "grey")) +
    labs(x = "", y = "Coefficient", fill = "") +
    # "none" replaces the deprecated guides(fill = FALSE) and matches the
    # other plotting helpers in this script.
    guides(fill = "none")
}


# Plot the minority-group coefficients of a climate-change regression.
#
# results: data frame with the columns coef, conf_low and conf_high whose
#          row names are model terms (the shape built by extract_results()).
# xlab, ylab: axis titles passed to labs() before the axes are flipped.
#
# Returns a ggplot object with one point and confidence-interval bar per
# minority indicator; intervals that exclude zero are drawn in dark red,
# all others in dark gray.
plot_results_cc <- function(results, xlab, ylab) {
  # Display label for every term of interest. Rows are labelled by NAME, so
  # the labels stay correct whatever order the predictors had in the model
  # formula, and also when a term is absent from the model.
  minority_labels <- c(
    raceminorminor = "Race/ethnicity minority",
    langminorminor = "Language minority",
    eduminorminor = "Education minority",
    ccminorminor = "Opinion minority"
  )
  keep <- rownames(results) %in% names(minority_labels)
  results <- results[keep, , drop = FALSE]
  rownames(results) <- unname(minority_labels[rownames(results)])

  # Dark red when the confidence interval does not include 0, dark gray
  # otherwise.
  significant <- results$conf_low > 0 | results$conf_high < 0
  highlight <- ifelse(significant, "dark red", "dark gray")
  results$color <- highlight
  results$point_color <- highlight
  results$fill_color <- highlight

  ggplot(results, aes(x = rownames(results),
                      y = coef, fill = fill_color)) +
    geom_errorbar(aes(ymin = conf_low, ymax = conf_high), size = 2, width = 0,
                  color = results$color) +
    geom_point(size = 3, color = results$point_color) +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.text.y = element_text(size = 12)) +
    geom_hline(yintercept = 0, color = "black", size = 1) +
    labs(x = xlab, y = ylab) +
    coord_flip() +
    guides(fill = "none") +
    scale_y_continuous(breaks = c(-0.1, 0, 0.1))
}

# Plot the minority-group coefficients of a BLM regression.
#
# results: data frame with the columns coef, conf_low and conf_high whose
#          row names are model terms (the shape built by extract_results()).
# xlab, ylab: axis titles passed to labs() before the axes are flipped.
#
# Returns a ggplot object with one point and confidence-interval bar per
# minority indicator; intervals that exclude zero are drawn in dark red,
# all others in dark gray.
plot_results_blm <- function(results, xlab, ylab) {
  # Display label for every term of interest. Rows are labelled by NAME, so
  # the labels stay correct whatever order the predictors had in the model
  # formula, and also when a term is absent from the model.
  minority_labels <- c(
    raceminorminor = "Race/ethnicity minority",
    langminorminor = "Language minority",
    eduminorminor = "Education minority",
    blmminorminor = "Opinion minority"
  )
  keep <- rownames(results) %in% names(minority_labels)
  results <- results[keep, , drop = FALSE]
  rownames(results) <- unname(minority_labels[rownames(results)])

  # Dark red when the confidence interval does not include 0, dark gray
  # otherwise.
  significant <- results$conf_low > 0 | results$conf_high < 0
  highlight <- ifelse(significant, "dark red", "dark gray")
  results$color <- highlight
  results$point_color <- highlight
  results$fill_color <- highlight

  ggplot(results, aes(x = rownames(results),
                      y = coef, fill = fill_color)) +
    geom_errorbar(aes(ymin = conf_low, ymax = conf_high), size = 2, width = 0,
                  color = results$color) +
    geom_point(size = 3, color = results$point_color) +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.text.y = element_text(size = 12)) +
    geom_hline(yintercept = 0, color = "black", size = 1) +
    labs(x = xlab, y = ylab) +
    coord_flip() +
    guides(fill = "none") +
    scale_y_continuous(breaks = c(-0.05, 0, 0.05))
}


# Plot the LIWC language-feature coefficients of a regression (RQ3).
#
# results: data frame with the columns coef, conf_low and conf_high whose
#          row names are model terms (the shape built by extract_results()).
# xlab, ylab: axis titles passed to labs() before the axes are flipped.
#
# Returns a ggplot object with one point and confidence-interval bar per
# LIWC feature; intervals that exclude zero are drawn in dark blue, all
# others in dark gray.
plot_results_LIWC <- function(results, xlab, ylab) {
  # Display label for every LIWC term. Rows are labelled by NAME, so the
  # labels stay correct whatever order the predictors had in the model
  # formula, and also when a term is absent from the model.
  liwc_labels <- c(
    WC_log = "Word count (log)",
    emo_pos_log = "Positive emotion (log)",
    emo_neg_log = "Negative emotion (log)",
    Analytic = "Analytic",
    Clout = "Clout",
    Authentic = "Authentic"
  )
  keep <- rownames(results) %in% names(liwc_labels)
  results <- results[keep, , drop = FALSE]
  rownames(results) <- unname(liwc_labels[rownames(results)])

  # Dark blue when the confidence interval does not include 0, dark gray
  # otherwise.
  significant <- results$conf_low > 0 | results$conf_high < 0
  highlight <- ifelse(significant, "dark blue", "dark gray")
  results$color <- highlight
  results$point_color <- highlight
  results$fill_color <- highlight

  ggplot(results, aes(x = rownames(results),
                      y = coef, fill = fill_color)) +
    geom_errorbar(aes(ymin = conf_low, ymax = conf_high), size = 1, width = 0,
                  color = results$color) +
    geom_point(size = 3, color = results$point_color) +
    theme_minimal() +
    theme(panel.grid = element_blank()) +
    geom_hline(yintercept = 0, color = "black", size = 1) +
    labs(x = xlab, y = ylab) +
    coord_flip() +
    guides(fill = "none")
}


# Parse one dialogue string into single turns with their speaker (AI or human).
#
# x: a single character string holding the whole conversation.
#
# Relies on the global `pattern` (a regex with one capture group that matches
# the speaker tags), which must exist when the function is called. A newline
# is inserted in front of every tag, the text is split on newlines, and the
# piece before the first newline is discarded. Each remaining piece is split
# at its first ":" into `speaker` and `text`.
#
# Returns a data frame with the character columns `speaker` and `text`.
#
# This function was used to parse the raw conversation data. The raw data is
# not provided for this replication; feel free to import the parsed data
# instead.
conv_parser <- function(x) {
  tagged <- gsub(pattern, "\n\\1", x)
  turns <- strsplit(tagged, "\n")[[1]][-1]
  template <- data.frame(speaker = character(), text = character())
  strcapture("(.*?):(.*)", turns, template)
}


# Find example texts for a topic of a topic model.
#
# model:  fitted topic model handed to findThoughts().
# text:   documents; coerced to character before the lookup.
# number: how many example documents to request (findThoughts `n`).
# topic:  topic(s) to query (findThoughts `topics`).
#
# Returns the first element of the `docs` list produced by findThoughts().
findexamples <- function(model, text, number, topic) {
  thoughts <- findThoughts(model,
                           texts = as.character(text),
                           n = number,
                           topics = topic)
  thoughts$docs[[1]]
}

# Average share of each topic across all documents.
#
# model: object with a `theta` matrix (rows = documents, columns = topics)
#        of document-topic probabilities.
#
# Returns a numeric vector with one value per topic: the column sum of theta
# divided by the number of documents, i.e. a proportion rather than a
# percentage.
calculate_topic_prevalence <- function(model) {
  theta <- model$theta
  colSums(theta) / nrow(theta)
}

# Plot the minority-group effects on one topic's prevalence (climate change).
#
# model:    object whose summary() carries a `tables` list with one
#           coefficient table per topic.
# topicnum: index of the topic table to plot.
# xlab, ylab: axis titles passed to labs() before the axes are flipped.
#
# The table is passed through manualCI(), which is defined outside this
# function and is expected to supply the columns est, lwr and upr.
#
# Returns a ggplot object with one point and interval per term whose name
# contains "minor"; intervals excluding zero are dark red, others dark gray.
topic_prev_cc <- function(model, topicnum, xlab, ylab) {
  coef_table <- summary(model)$tables[[topicnum]]
  results <- cbind(varname = rownames(coef_table), manualCI(coef_table))
  rownames(results) <- 1:nrow(results)
  results <- filter(results, grepl("minor", varname))

  # Replace model term names with display labels, one substitution per term.
  term_labels <- c(
    raceminorminor = "Race/ethnicity minority",
    langminorminor = "Language minority",
    eduminorminor = "Education minority",
    ccminorminor = "Opinion minority"
  )
  for (term in names(term_labels)) {
    results$varname <- gsub(term, term_labels[[term]], results$varname)
  }

  excludes_zero <- results$lwr > 0 | results$upr < 0
  results$color <- ifelse(excludes_zero, "dark red", "dark gray")
  results$point_color <- ifelse(excludes_zero, "dark red", "dark gray")
  results$fill_color <- ifelse(excludes_zero, "dark red", "dark gray")

  base_plot <- ggplot(results, aes(x = varname, y = est, fill = fill_color))
  base_plot +
    geom_errorbar(aes(ymin = lwr, ymax = upr), size = 2, width = 0,
                  color = results$color) +
    geom_point(size = 3, color = results$point_color) +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.text.y = element_text(size = 12)) +
    geom_hline(yintercept = 0, color = "black", size = 1) +
    labs(x = xlab, y = ylab) +
    coord_flip() +
    guides(fill = "none") +
    scale_y_continuous(breaks = c(-0.01, 0, 0.01))
}

# Plot the minority-group effects on one topic's prevalence (BLM).
#
# model:    object whose summary() carries a `tables` list with one
#           coefficient table per topic.
# topicnum: index of the topic table to plot.
# xlab, ylab: axis titles passed to labs() before the axes are flipped.
#
# The table is passed through manualCI(), which is defined outside this
# function and is expected to supply the columns est, lwr and upr.
#
# Returns a ggplot object with one point and interval per term whose name
# contains "minor"; intervals excluding zero are dark red, others dark gray.
topic_prev_blm <- function(model, topicnum, xlab, ylab) {
  coef_table <- summary(model)$tables[[topicnum]]
  results <- cbind(varname = rownames(coef_table), manualCI(coef_table))
  rownames(results) <- 1:nrow(results)
  results <- filter(results, grepl("minor", varname))

  # Replace model term names with display labels, one substitution per term.
  term_labels <- c(
    raceminorminor = "Race/ethnicity minority",
    langminorminor = "Language minority",
    eduminorminor = "Education minority",
    blmminorminor = "Opinion minority"
  )
  for (term in names(term_labels)) {
    results$varname <- gsub(term, term_labels[[term]], results$varname)
  }

  excludes_zero <- results$lwr > 0 | results$upr < 0
  results$color <- ifelse(excludes_zero, "dark red", "dark gray")
  results$point_color <- ifelse(excludes_zero, "dark red", "dark gray")
  results$fill_color <- ifelse(excludes_zero, "dark red", "dark gray")

  base_plot <- ggplot(results, aes(x = varname, y = est, fill = fill_color))
  base_plot +
    geom_errorbar(aes(ymin = lwr, ymax = upr), size = 2, width = 0,
                  color = results$color) +
    geom_point(size = 3, color = results$point_color) +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.text.y = element_text(size = 12)) +
    geom_hline(yintercept = 0, color = "black", size = 1) +
    labs(x = xlab, y = ylab) +
    coord_flip() +
    guides(fill = "none") +
    scale_y_continuous(breaks = c(-0.01, 0, 0.01))
}

##### Data Prep #####
###### read data files ######
# Before running this section, set the working directory to the folder that
# holds the replication data files.

# Qualtrics output without ids; the first column of the file is discarded.
df <- read.csv("survey_cleaned.csv")[, -1]

###### merge survey with the Human-AI chat dataset ######
# Load the chat export, drop its X and timestamp columns, and rename X_id to
# `code` so that it matches the survey's join key.
df_chat <- read.csv(
  "chat-all.csv",
  header = TRUE, sep = ",", fill = TRUE, encoding = "UTF-8",
  row.names = NULL,
  stringsAsFactors = FALSE
)
df_chat <- select(df_chat, -c(X, timestamp))
df_chat <- rename(df_chat, 'code' = 'X_id')

# Keep every survey row and attach the chat columns where `code` matches.
df <- left_join(df, df_chat, by = "code")

##### Descriptive Data #####
# Frequency tables of the experimental condition and the demographic
# variables. The results are shown only when the script is run interactively
# or sourced with echo = TRUE, because nothing prints them explicitly.
table(df$survey_type)
table(df$gender)
table(df$edu)
table(df$ethnic)
table(df$income)
table(df$language)

# Political ideology; the shares below were recorded by the authors from this
# table.
table(df$political)
#45.65% conservatives
#17.02% neutral
#37.33% liberals

# Switch the whole session to the C locale.
# NOTE(review): presumably needed so gsub() does not fail on non-ASCII chat
# text - confirm. The setting is not restored afterwards.
Sys.setlocale('LC_ALL','C')

# Recode the 5-point agreement labels to the digits 5..1.
# lapply(df, gsub, ...) runs over EVERY column: all columns come back as
# character vectors, and a matching phrase inside any free-text column is
# replaced as well.
df[] <- lapply(df, gsub, pattern = 'Strongly agree', replacement = 5, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Somewhat agree", replacement = 4, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Neither agree nor disagree", replacement = 3, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Somewhat disagree", replacement = 2, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Strongly disagree", replacement = 1, fixed = TRUE)

# Recode political ideology: 5 = very liberal ... 1 = very conservative.
df[] <- lapply(df, gsub, pattern = "Very liberal", replacement = 5, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Slightly liberal", replacement = 4, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Neutral/ Neither conservative or liberal", replacement = 3, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Slightly conservative", replacement = 2, fixed = TRUE)
df[] <- lapply(df, gsub, pattern = "Very conservative", replacement = 1, fixed = TRUE)

#reversed scales
# Reverse-code item 3 of the BLM scales: 5 <-> 1, 4 <-> 2, 3 stays 3.
# This is done arithmetically (6 - x) instead of with
# textclean::mgsub(x, c(5,4,2,1), c(1,2,4,5)): with its default safe = FALSE,
# mgsub() applies the substitutions one after another, so a value that has
# just been swapped is matched again by a later pair and swapped back.
# The result is stored as character so the column keeps the same type as the
# other recoded items until type.convert() runs.
df$blm_pre_3 <- as.character(6 - as.numeric(df$blm_pre_3))
df$OBLM_post_3 <- as.character(6 - as.numeric(df$OBLM_post_3))
df$IBLM_post_3 <- as.character(6 - as.numeric(df$IBLM_post_3))

# Scale reliability: Cronbach's alpha for every multi-item measure. The value
# after each call is the alpha recorded by the authors.
# NOTE(review): the columns are still character at this point, and
# data.matrix() converts character columns to factor codes rather than to
# their numeric values - confirm this gives the intended item scores.

#bot experience
# Uses items 1, 2 and 4 only.
psych::alpha(data.matrix(dplyr::select(df,c(bot_pre_1,bot_pre_2,bot_pre_4)))) #0.69

#climate change - pre
psych::alpha(data.matrix(dplyr::select(df,c(cc_pre_1:cc_pre_6)))) #0.86

#climate change - post (info)
psych::alpha(data.matrix(dplyr::select(df,c(ICC_post_1:ICC_post_6)))) #0.86

#climate change - post (opinion)
psych::alpha(data.matrix(dplyr::select(df,c(OCC_post_1:OCC_post_6)))) #0.88

#blm - pre
psych::alpha(data.matrix(dplyr::select(df,c(blm_pre_1:blm_pre_6)))) #0.87

#BLM - post (info)
psych::alpha(data.matrix(dplyr::select(df,c(IBLM_post_1:IBLM_post_6)))) #0.88

#BLM - post (opinion)
psych::alpha(data.matrix(dplyr::select(df,c(OBLM_post_1:OBLM_post_6)))) #0.88

# List the column names (interactive check).
colnames(df)

#satisfaction
psych::alpha(data.matrix(dplyr::select(df,c(satisfaction_1:satisfaction_5)))) #0.89

#rating
psych::alpha(data.matrix(dplyr::select(df,c(rate_1:rate_4)))) #0.89

#learning experience
psych::alpha(data.matrix(dplyr::select(df,c(cc_learn_1:cc_learn_2)))) #0.88
psych::alpha(data.matrix(dplyr::select(df,c(blm_learn_1:blm_learn_2)))) #0.89

#continue
psych::alpha(data.matrix(dplyr::select(df,c(cc_continue_1:cc_continue_3)))) #0.93
psych::alpha(data.matrix(dplyr::select(df,c(blm_continue_1:blm_continue_3)))) #0.94

#recommend
psych::alpha(data.matrix(dplyr::select(df,c(cc_recommend_1:cc_recommend_2)))) #0.92
psych::alpha(data.matrix(dplyr::select(df,c(blm_recommend_1:blm_recommend_2)))) #0.93

###### summarize variables ######
# Each variable is converted from character to numeric and summarised as
# "mean (SD)" with meansd().
df$age <- as.numeric(df$age)
meansd(df$age)

df$political <- as.numeric(df$political)
meansd(df$political)

df$expect_rounds <- as.numeric(df$expect_rounds)
#boxplot(df$expect_rounds)
# Answers of 1000 rounds or more are left out of this summary only; the
# column in `df` itself is not filtered.
expect_r <- df %>% filter(expect_rounds < 1000) %>% select(expect_rounds)
meansd(expect_r$expect_rounds)

df$response_count <- as.numeric(df$response_count)
meansd(df$response_count)

# Convert the remaining character columns to their natural types.
# as.is = TRUE is spelled out: it is what R (>= 4.0) falls back to when the
# argument is missing, but omitting it raises a warning. Text columns stay
# character instead of becoming factors.
df <- type.convert(df, as.is = TRUE)
# Inspect the resulting column classes (interactive check).
sapply(df, class)

# Build the analysis data frame `data`:
#   1. average the items of every multi-item scale into one score per row;
#   2. paste the topic-specific pre/post scores into the single columns
#      `pre` and `post` (turned into numbers after the pipeline);
#   3. derive the minority/majority indicators and the ideology group;
#   4. keep only the columns used later.
# rowMeans() is called without na.rm, so a row with any missing item gets NA
# for that scale.
data <- df %>%
  # NOTE(review): the reliability check for this scale used items 1, 2 and 4
  # only, while this range takes every column from bot_pre_1 to bot_pre_4 -
  # confirm whether a bot_pre_3 column exists and should be included.
  mutate(bot_pre = rowMeans(dplyr::select(.,bot_pre_1:bot_pre_4)))%>%
  mutate(cc_pre = rowMeans(dplyr::select(.,cc_pre_1:cc_pre_6)))%>%
  mutate(blm_pre = rowMeans(dplyr::select(.,blm_pre_1:blm_pre_6)))%>%
  mutate(ICC_post = rowMeans(dplyr::select(.,ICC_post_1:ICC_post_6)))%>%
  mutate(OCC_post = rowMeans(dplyr::select(.,OCC_post_1:OCC_post_6)))%>%
  mutate(IBLM_post = rowMeans(dplyr::select(.,IBLM_post_1:IBLM_post_6)))%>%
  mutate(OBLM_post = rowMeans(dplyr::select(.,OBLM_post_1:OBLM_post_6)))%>%
  mutate(satisfaction = rowMeans(dplyr::select(.,satisfaction_1:satisfaction_5)))%>%
  mutate(rate = rowMeans(dplyr::select(.,rate_1:rate_4)))%>%
  mutate(cc_learn = rowMeans(dplyr::select(.,cc_learn_1:cc_learn_2)))%>%
  mutate(cc_continue = rowMeans(dplyr::select(.,cc_continue_1:cc_continue_3)))%>%
  mutate(cc_recommend = rowMeans(dplyr::select(.,cc_recommend_1:cc_recommend_2)))%>%
  mutate(blm_learn = rowMeans(dplyr::select(.,blm_learn_1:blm_learn_2)))%>%
  mutate(blm_continue = rowMeans(dplyr::select(.,blm_continue_1:blm_continue_3)))%>%
  mutate(blm_recommend = rowMeans(dplyr::select(.,blm_recommend_1:blm_recommend_2)))%>%
  # Pasted as text here, e.g. "4.5 NA"; cleaned and converted below.
  mutate(pre = paste(cc_pre , blm_pre))%>%
  mutate(post = paste(ICC_post , OCC_post , IBLM_post , OBLM_post))%>%
  # Education minority: high school or "other"; everyone else is "major".
  mutate(eduminor = ifelse(edu == "Highschool"|edu == "Other (please specify):","minor","major"))%>%
  # Race/ethnicity minority: everyone not recorded as "White".
  mutate(raceminor = ifelse(ethnic == "White","major","minor"))%>%
  # NOTE(review): the label compared against starts with a space
  # (" Black or African American") - confirm it matches the data exactly,
  # otherwise no row is ever coded "black".
  mutate(raceminor_blm = ifelse(ethnic == "White","white",
                                ifelse(ethnic == " Black or African American","black","other")))%>%
  # Language minority: everyone whose language is not "English".
  mutate(langminor = ifelse(language == "English","major","minor"))%>%
  # Ideology groups from the 1-5 scale: 4-5 liberal, 3 neutral, else conservative.
  mutate(ideology = ifelse(political == 5 | political == 4,"liberal",
                           ifelse(political == 3,"neutral","conservative")))%>%
  select(c(EndDate:political,want:check,response_count,survey_type,prompt,bot_pre:blm_recommend,pre,post,
           eduminor,raceminor,raceminor_blm,langminor,ideology))

#data <- data %>% filter(gender == "Female" | gender == "Male")

# Turn the pasted strings into numbers: remove the spaces and the "NA"
# pieces, then convert what is left.
# NOTE(review): this yields a usable number only when exactly one of the
# pasted scores is non-missing for a row - confirm that holds for every
# participant.
data$post <- gsub(' ', '', data$post)
data$post <- gsub('\\NA','',data$post)
data$post <- as.numeric(data$post)
data$pre <- gsub(' ', '', data$pre)
data$pre <- gsub('\\NA','',data$pre)
data$pre <- as.numeric(data$pre)
# Attitude change: post-chat score minus pre-chat score.
data$change <- data$post - data$pre

# One data frame per experimental condition.
data_ICC <- filter(data, survey_type == "info-climate")
data_OCC <- filter(data, survey_type == "opinion-climate")
data_IBLM <- filter(data, survey_type == "info-blm")
data_OBLM <- filter(data, survey_type == "opinion-blm")

# Climate-change participants (both conditions). Opinion minority = pre-chat
# climate attitude below 3.833.
# NOTE(review): 3.833 looks like a cut-off taken from summary(data_cc$cc_pre)
# - confirm.
#summary(data_cc$cc_pre)
data_cc <- data %>%
  filter(survey_type %in% c("info-climate", "opinion-climate")) %>%
  mutate(ccminor = ifelse(cc_pre < 3.833, "minor", "major"))
table(data_cc$ccminor)

# BLM participants (both conditions). Opinion minority = pre-chat BLM
# attitude below 3.167.
# NOTE(review): 3.167 looks like a cut-off taken from summary(data_blm$blm_pre)
# - confirm.
#summary(data_blm$blm_pre)
data_blm <- data %>%
  filter(survey_type %in% c("info-blm", "opinion-blm")) %>%
  mutate(blmminor = ifelse(blm_pre < 3.167, "minor", "major"))

###### data wrangling for conversations ######
# Keep only the conversation-related columns and strip the fixed system
# preamble from every prompt.
df_chat <- select(data, EndDate, prompt, want, response_count, survey_type)
df_chat$prompt <- gsub('The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.\n', '', df_chat$prompt)
# Store the row names as an explicit `index` column, then renumber the rows.
df_chat <- cbind(index = rownames(df_chat), df_chat)
rownames(df_chat) <- 1:nrow(df_chat)
write.csv(df_chat, "chat_cleaned.csv")

df_chat_cc <- filter(df_chat, survey_type %in% c("info-climate", "opinion-climate"))
df_chat_blm <- filter(df_chat, survey_type %in% c("info-blm", "opinion-blm"))

# Speaker tags and the regex that matches them, "(Human:|AI:)"; conv_parser()
# reads `pattern` from the global environment.
speakers <- c("Human", "AI")
speaker_tags <- paste0(speakers, ":")
pattern <- paste0("(", paste(speaker_tags, collapse = "|"), ")")

##### User (Human) Language Differences (LIWC) #####
# Read the LIWC-22 results for the human side of the conversations, one file
# per topic.
blm_human_LIWC <- read.csv("LIWC-22 Results - conv_blm_human - LIWC Analysis.csv",
                           fill=TRUE, header=TRUE, sep=",", encoding="UTF-8",
                           row.names = NULL,
                           stringsAsFactors = FALSE)
cc_human_LIWC <- read.csv("LIWC-22 Results - conv_cc_human - LIWC Analysis.csv",
                          fill=TRUE, header=TRUE, sep=",", encoding="UTF-8",
                          row.names = NULL,
                          stringsAsFactors = FALSE)

# Climate change: average every LIWC column from WC through OtherP within
# each `index`, then attach the survey variables.
# NOTE(review): the join needs an `index` column of type character in
# data_cc - confirm it is among the columns kept when `data` is built.
cc_human_LIWC <- cc_human_LIWC %>%
  group_by(index) %>%
  summarise(across(WC:OtherP, mean))
cc_human_LIWC$index <- as.character(cc_human_LIWC$index)
cc_human_LIWC <- inner_join(cc_human_LIWC,data_cc,by="index")

# log(x + 1) versions of the word count and emotion measures; the + 1 keeps
# zero counts defined.
cc_human_LIWC$WC_log <- log(cc_human_LIWC$WC + 1)
cc_human_LIWC$emo_pos_log <- log(cc_human_LIWC$emo_pos + 1)
cc_human_LIWC$emo_neg_log <- log(cc_human_LIWC$emo_neg + 1)
cc_human_LIWC$emo_anx_log <- log(cc_human_LIWC$emo_anx + 1)
cc_human_LIWC$emo_anger_log <- log(cc_human_LIWC$emo_anger + 1)
cc_human_LIWC$swear_log <- log(cc_human_LIWC$swear + 1)

# BLM: same steps. The survey side of the join is data_blm, mirroring the
# climate branch; the name used before (data_blm_read) is never created by
# this script, so the join stopped with "object not found".
# NOTE(review): as above, data_blm must carry a character `index` column.
blm_human_LIWC <- blm_human_LIWC %>%
  group_by(index) %>%
  summarise(across(WC:OtherP, mean))
blm_human_LIWC$index <- as.character(blm_human_LIWC$index)
blm_human_LIWC <- inner_join(blm_human_LIWC,data_blm,by="index")

blm_human_LIWC$WC_log <- log(blm_human_LIWC$WC + 1)
blm_human_LIWC$emo_pos_log <- log(blm_human_LIWC$emo_pos + 1)
blm_human_LIWC$emo_neg_log <- log(blm_human_LIWC$emo_neg + 1)
blm_human_LIWC$emo_anx_log <- log(blm_human_LIWC$emo_anx + 1)
blm_human_LIWC$emo_anger_log <- log(blm_human_LIWC$emo_anger + 1)
blm_human_LIWC$swear_log <- log(blm_human_LIWC$swear + 1)

#start regression
# OLS models of the users' own language features on their demographics, one
# model per LIWC outcome and topic. Every model uses the same predictors; only
# the opinion-minority term differs (ccminor vs. blmminor).

# Outcomes:
#WC_log+ emo_pos_log+ emo_neg_log+ emo_anx_log+
#Analytic+ Clout+Authentic

# Predictors:
#gender + age + income + bot_pre + raceminor + 
#langminor + eduminor + ideology +  blmminor

# Make "neutral" the reference category of ideology in both data sets.
cc_human_LIWC$ideology <- 
  relevel(factor(cc_human_LIWC$ideology), ref = "neutral")
blm_human_LIWC$ideology <- 
  relevel(factor(blm_human_LIWC$ideology), ref = "neutral")   

# Climate change models.
cchuman_1 <- lm(WC_log ~ gender + age + income + bot_pre + raceminor + 
                  langminor + eduminor + ideology +  ccminor,data = cc_human_LIWC)
cchuman_2 <- lm(emo_pos_log ~ gender + age + income + bot_pre + raceminor + 
                  langminor + eduminor + ideology +  ccminor,data = cc_human_LIWC)
cchuman_3 <- lm(emo_neg_log ~ gender + age + income + bot_pre + raceminor + 
                  langminor + eduminor + ideology +  ccminor,data = cc_human_LIWC)
cchuman_4 <- lm(emo_anx_log ~ gender + age + income + bot_pre + raceminor + 
                  langminor + eduminor + ideology +  ccminor,data = cc_human_LIWC)
cchuman_5 <- lm(Analytic ~ gender + age + income + bot_pre + raceminor + 
                  langminor + eduminor + ideology +  ccminor,data = cc_human_LIWC)
cchuman_6 <- lm(Clout ~ gender + age + income + bot_pre + raceminor + 
                  langminor + eduminor + ideology +  ccminor,data = cc_human_LIWC)
cchuman_7 <- lm(Authentic ~ gender + age + income + bot_pre + raceminor + 
                  langminor + eduminor + ideology +  ccminor,data = cc_human_LIWC)

# BLM models.
blmhuman_1 <- lm(WC_log ~ gender + age + income + bot_pre + raceminor + 
                   langminor + eduminor + ideology +  blmminor,data = blm_human_LIWC)
blmhuman_2 <- lm(emo_pos_log ~ gender + age + income + bot_pre + raceminor + 
                   langminor + eduminor + ideology +  blmminor,data = blm_human_LIWC)
blmhuman_3 <- lm(emo_neg_log ~ gender + age + income + bot_pre + raceminor + 
                   langminor + eduminor + ideology +  blmminor,data = blm_human_LIWC)
blmhuman_4 <- lm(emo_anx_log ~ gender + age + income + bot_pre + raceminor + 
                   langminor + eduminor + ideology +  blmminor,data = blm_human_LIWC)
blmhuman_5 <- lm(Analytic ~ gender + age + income + bot_pre + raceminor + 
                   langminor + eduminor + ideology +  blmminor,data = blm_human_LIWC)
blmhuman_6 <- lm(Clout ~ gender + age + income + bot_pre + raceminor + 
                   langminor + eduminor + ideology +  blmminor,data = blm_human_LIWC)
blmhuman_7 <- lm(Authentic ~ gender + age + income + bot_pre + raceminor + 
                   langminor + eduminor + ideology +  blmminor,data = blm_human_LIWC)


# Write one regression table per topic to an HTML file in the working
# directory.
stargazer(cchuman_1,cchuman_2,cchuman_3,cchuman_4,cchuman_5,cchuman_6,cchuman_7,
          align = TRUE,out = "humanLIWCfeature_cc.html")

stargazer(blmhuman_1,blmhuman_2,blmhuman_3,blmhuman_4,blmhuman_5,blmhuman_6,blmhuman_7,
          align = TRUE,out = "humanLIWCfeature_blm.html")


##### [RQ1] User Experience Gap #####
# The independent variables are the participants' demographic features
# (e.g., race, gender, education).
# The control variables are the user's prior experience with chatbots and the
# user's language style (from the section "User (Human) Language Differences
# (LIWC)").
# These models show where the user-experience gap lies for conversational AI.

# Make "neutral" the reference category of ideology (repeated here so this
# section can be run on its own).
cc_human_LIWC$ideology <- 
  relevel(factor(cc_human_LIWC$ideology), ref = "neutral")
blm_human_LIWC$ideology <- 
  relevel(factor(blm_human_LIWC$ideology), ref = "neutral")

###### > [Figure 2A]ols regressions & visualization - Climate Change ######
# One OLS model per user-experience outcome, all with the same predictors.
# The generic names (lm_change, lm_rate, ...) are reused by the BLM section,
# which overwrites these objects.
lm_change <- lm(change ~ gender + age + income + bot_pre + 
                  raceminor + eduminor + langminor + ccminor + ideology +  
                  WC_log+ emo_pos_log+ emo_neg_log+ 
                  Analytic+ Clout+Authentic,data = cc_human_LIWC)

lm_rate <- lm(rate ~ gender + age + income + bot_pre +
                raceminor + langminor + eduminor + ideology +
                ccminor + WC_log+ emo_pos_log+ emo_neg_log+
                Analytic+ Clout+Authentic,data = cc_human_LIWC)

lm_satisfaction <- lm(satisfaction ~ gender + age + income + bot_pre +
                        raceminor + langminor + eduminor + ideology +
                        ccminor + WC_log+ emo_pos_log+ emo_neg_log+
                        Analytic+ Clout+Authentic,data = cc_human_LIWC)

lm_learn <- lm(cc_learn ~ gender + age + income + bot_pre +
                 raceminor + langminor + eduminor + ideology +
                 ccminor + WC_log+ emo_pos_log+ emo_neg_log+
                 Analytic+ Clout+Authentic,data = cc_human_LIWC)

lm_continue <- lm(cc_continue ~ gender + age + income + bot_pre +
                    raceminor + langminor + eduminor + ideology +
                    ccminor + WC_log+ emo_pos_log+ emo_neg_log+
                    Analytic+ Clout+Authentic,data = cc_human_LIWC)

lm_recommend <- lm(cc_recommend ~ gender + age + income + bot_pre +
                     raceminor + langminor + eduminor + ideology +
                     ccminor + WC_log+ emo_pos_log+ emo_neg_log+
                     Analytic+ Clout+Authentic,data = cc_human_LIWC)

#Visualization
# For every outcome model keep the four minority terms and tag the rows with
# the outcome they belong to.
# The short names ("ccminor", ...) select the coefficient rows
# ("ccminorminor", ...) through the partial matching of row names that
# data-frame row indexing performs; the rows keep their full names.
results1 <- extract_results(lm_change)[c("ccminor", "eduminor", "raceminor", "langminor"), ]
results1$DV <- "Change in Attitude"

results2 <- extract_results(lm_rate)[c("ccminor", "eduminor", "raceminor", "langminor"), ]
results2$DV <- "Rating of GPT-3"

results3 <- extract_results(lm_satisfaction)[c("ccminor", "eduminor", "raceminor", "langminor"), ]
results3$DV <- "Satisfaction of GPT-3"

results4 <- extract_results(lm_learn)[c("ccminor", "eduminor", "raceminor", "langminor"), ]
results4$DV <- "Learning Experience with the GPT-3"

results5 <- extract_results(lm_continue)[c("ccminor", "eduminor", "raceminor", "langminor"), ]
results5$DV <- "Intention to Continue Chat"

results6 <- extract_results(lm_recommend)[c("ccminor", "eduminor", "raceminor", "langminor"), ]
results6$DV <- "Intention to Recommend to Others"

# Stack the six outcome tables. rbind() makes the repeated row names unique
# by appending a number, so each term is found again by pattern below.
combined_df <- rbind(results1, results2, results3, results4, results5, results6)

# One coloured subset per minority group, selected by row-name pattern.
ccminor <- add_color(combined_df[grepl("ccminorminor", rownames(combined_df)), ])
eduminor <- add_color(combined_df[grepl("eduminorminor", rownames(combined_df)), ])
raceminor <- add_color(combined_df[grepl("raceminorminor", rownames(combined_df)), ])
langminor <- add_color(combined_df[grepl("langminorminor", rownames(combined_df)), ])

# Add the significance colour to the stacked table as well.
combined_df <- add_color(combined_df)

# Label column for Figure 2: an empty plot that shows only the outcome (DV)
# names on the flipped axis. It is placed to the left of the coefficient
# panels, which are drawn without y-axis text.
p0 <- ggplot(ccminor, aes(x = factor(DV), y = coef)) +
  geom_blank() +
  coord_flip() +
  scale_y_continuous(limits = c(-1.5, 1.5)) +
  theme(axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        plot.background = element_blank(),
        axis.text.y = element_text(size = 12)) +
  labs(x = "", y = "Coefficient") +
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")


# One coefficient panel per minority group.
p1 <- plot_subset_noy(combined_df, "ccminorminor") + ggtitle("Opinion minority")
p2 <- plot_subset_noy(combined_df, "eduminorminor") + ggtitle("Education minority")
p3 <- plot_subset_noy(combined_df, "raceminorminor") + ggtitle("Race/ethnicity minority")
p4 <- plot_subset_noy(combined_df, "langminorminor") + ggtitle("Language minority")

# Preview with the default title size.
grid.arrange(p0, p1, p2, p3, p4,
             ncol = 5,
             top=textGrob("Panel A: Climate Change",x = 0, hjust = 0))
# Final Panel A with the larger title; grid.arrange() draws it again and the
# arranged object is stored in p_rq1_cc.
p_rq1_cc <- grid.arrange(p0, p1, p2, p3, p4,
                         ncol = 5,
                         top=textGrob("Panel A: Climate Change",x = 0, hjust = 0,
                                      gp = gpar(fontsize = 18)))

###### > [Figure 2B]ols regressions & visualization - BLM ######
# Same six outcome models for the BLM conversations. The opinion-minority
# term is blmminor and the learn/continue/recommend outcomes are the BLM
# versions. These assignments overwrite the climate-change models that use
# the same object names.
lm_change <- lm(change ~ gender + age + income + bot_pre + 
                  raceminor + eduminor + langminor + blmminor + ideology +  
                  WC_log+ emo_pos_log+ emo_neg_log+ 
                  Analytic+ Clout+Authentic,data = blm_human_LIWC)

lm_rate <- lm(rate ~ gender + age + income + bot_pre +
                raceminor + eduminor + langminor + blmminor + ideology +
                WC_log+ emo_pos_log+ emo_neg_log+
                Analytic+ Clout+Authentic,data = blm_human_LIWC)

lm_satisfaction <- lm(satisfaction ~ gender + age + income + bot_pre +
                        raceminor + eduminor + langminor + blmminor + ideology +
                        WC_log+ emo_pos_log+ emo_neg_log+
                        Analytic+ Clout+Authentic,data = blm_human_LIWC)

lm_learn <- lm(blm_learn ~ gender + age + income + bot_pre +
                 raceminor + eduminor + langminor + blmminor + ideology +
                 WC_log+ emo_pos_log+ emo_neg_log+
                 Analytic+ Clout+Authentic,data = blm_human_LIWC)

lm_continue <- lm(blm_continue ~ gender + age + income + bot_pre +
                    raceminor + eduminor + langminor + blmminor +ideology +
                    WC_log+ emo_pos_log+ emo_neg_log+
                    Analytic+ Clout+Authentic,data = blm_human_LIWC)

lm_recommend <- lm(blm_recommend ~ gender + age + income + bot_pre +
                     raceminor + eduminor + langminor + blmminor + ideology +
                     WC_log+ emo_pos_log+ emo_neg_log+
                     Analytic+ Clout+Authentic,data = blm_human_LIWC)

#Visualization
# For each outcome model keep only the four minority-status rows and tag them
# with the outcome label shown in the figure.
# NOTE(review): the plotting calls below match row names such as
# "blmminorminor", so the shorter names used for row selection here presumably
# resolve through data-frame row-name partial matching - confirm.
group_terms <- c("blmminor", "eduminor", "raceminor", "langminor")

results1 <- extract_results(lm_change)[group_terms, ]
results1$DV <- "Change in Attitude"

results2 <- extract_results(lm_rate)[group_terms, ]
results2$DV <- "Rating of GPT-3"

results3 <- extract_results(lm_satisfaction)[group_terms, ]
results3$DV <- "Satisfaction of GPT-3"

results4 <- extract_results(lm_learn)[group_terms, ]
results4$DV <- "Learning Experience with the GPT-3"

results5 <- extract_results(lm_continue)[group_terms, ]
results5$DV <- "Intention to Continue Chat"

results6 <- extract_results(lm_recommend)[group_terms, ]
results6$DV <- "Intention to Recommend to Others"

# One long table: four rows per outcome, six outcomes.
combined_df <- rbind(
  results1, results2, results3,
  results4, results5, results6
)

# Per-group subsets, picked by row name (taken before the color column exists).
result_rows <- rownames(combined_df)
blmminor <- combined_df[grepl("blmminorminor", result_rows), ]
eduminor <- combined_df[grepl("eduminorminor", result_rows), ]
raceminor <- combined_df[grepl("raceminorminor", result_rows), ]
langminor <- combined_df[grepl("langminorminor", result_rows), ]

# Color each coefficient by whether its confidence interval excludes zero.
combined_df <- add_color(combined_df)

plot_subset(combined_df, "blmminorminor")

# One panel per minority-status indicator.
p1 <- plot_subset_noy(combined_df, "blmminorminor")
p2 <- plot_subset_noy(combined_df, "eduminorminor")
p3 <- plot_subset_noy(combined_df, "raceminorminor")
p4 <- plot_subset_noy(combined_df, "langminorminor")

# Draw Panel B once for inspection ...
grid.arrange(
  p0, p1, p2, p3, p4,
  ncol = 5,
  top = textGrob(
    "Panel B: Black Lives Matter",
    x = 0, hjust = 0,
    gp = gpar(fontsize = 18)
  )
)

# ... and once more while keeping the arranged grob.
p_rq1_blm <- grid.arrange(
  p0, p1, p2, p3, p4,
  ncol = 5,
  top = textGrob(
    "Panel B: Black Lives Matter",
    x = 0, hjust = 0,
    gp = gpar(fontsize = 18)
  )
)

# Stack Panel A above Panel B.
grid.arrange(p_rq1_cc, p_rq1_blm, ncol = 1)

###### > quantile regressions ######
# Quantile regressions of each user-experience outcome, estimated at the
# quantiles held in `qs` (defined outside this section).
#
# The climate-change fits are stored under a `_cc` suffix. Previously both
# topics assigned to the same object names, so every climate-change fit was
# silently replaced by its BLM counterpart. The BLM fits keep the original
# names, so anything that read `qr_rate` etc. after this section still sees the
# same (BLM) objects as before.

#cc-rating
qr_rate_cc <- rq(
  rate ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    ccminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = cc_human_LIWC, tau = qs
)

#cc-satisfaction
qr_satisfaction_cc <- rq(
  satisfaction ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    ccminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = cc_human_LIWC, tau = qs
)

#cc-learning
qr_learn_cc <- rq(
  cc_learn ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    ccminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = cc_human_LIWC, tau = qs
)

#cc-continue
qr_continue_cc <- rq(
  cc_continue ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    ccminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = cc_human_LIWC, tau = qs
)

#cc-recommend
qr_recommend_cc <- rq(
  cc_recommend ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    ccminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = cc_human_LIWC, tau = qs
)

#blm-rating
qr_rate <- rq(
  rate ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    blmminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = blm_human_LIWC, tau = qs
)

#blm-satisfaction
qr_satisfaction <- rq(
  satisfaction ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    blmminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = blm_human_LIWC, tau = qs
)

#blm-learning
qr_learn <- rq(
  blm_learn ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    blmminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = blm_human_LIWC, tau = qs
)

#blm-continue
qr_continue <- rq(
  blm_continue ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    blmminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = blm_human_LIWC, tau = qs
)

#blm-recommend
qr_recommend <- rq(
  blm_recommend ~ gender + age + income + bot_pre +
    raceminor + langminor + eduminor + ideology +
    blmminor + WC_log + emo_pos_log + emo_neg_log +
    Analytic + Clout + Authentic,
  data = blm_human_LIWC, tau = qs
)

##### [RQ2] Differences in Chatbot Responses ####
# Turn the row names of `data` into an explicit `index` key column, then
# renumber the rows.
data <- cbind(index = rownames(data), data)
rownames(data) <- 1:nrow(data)
#data[1] <- NULL

# Attach the participant-level columns to every row of `dat` via `index`
# (right join: all rows of `dat` are kept).
data_topic <- data %>%
  right_join(dat, by = "index")

# Split by speaker label.
all_human <- data_topic %>%
  dplyr::filter(grepl("Human", speaker))
all_bot <- data_topic %>%
  dplyr::filter(grepl("AI", speaker))

###### > topic modeling - climate change ######
#Requires around 30 minutes depending on your RAM
# Wrangled chat files, one per topic.
dat_cc <- read.csv("chat_wrangled_cc.csv")
dat_blm <- read.csv("chat_wrangled_blm.csv")

# Climate change: join on `index` (keeping every row of dat_cc), then split by
# speaker label.
data_topic_cc <- data_cc %>%
  right_join(dat_cc, by = "index")
cc_human <- data_topic_cc %>%
  dplyr::filter(grepl("Human", speaker))
cc_bot <- data_topic_cc %>%
  dplyr::filter(grepl("AI", speaker))

# BLM: same join and split.
data_topic_blm <- data_blm %>%
  right_join(dat_blm, by = "index")
blm_human <- data_topic_blm %>%
  dplyr::filter(grepl("Human", speaker))
blm_bot <- data_topic_blm %>%
  dplyr::filter(grepl("AI", speaker))


# Coerce the conversation round to integer and drop rows where it is missing.
cc_bot[["round"]] <- as.integer(cc_bot[["round"]])
cc_bot <- drop_na(cc_bot, round)

# Text preprocessing and document preparation for stm.
processed_cc <- textProcessor(cc_bot$text, metadata = cc_bot)
out_cc <- prepDocuments(
  processed_cc$documents,
  processed_cc$vocab,
  processed_cc$meta
)

# 10-topic structural topic model; topic prevalence depends on round,
# demographics and the minority-status indicators.
cc_bot_topic <- stm(
  out_cc$documents, out_cc$vocab,
  K = 10,
  prevalence = ~ round + age + income + ideology + bot_pre +
    raceminor + eduminor + langminor + ccminor,
  data = out_cc$meta,
  init.type = "Spectral",
  max.em.its = 500,
  seed = 1000
)
summary(cc_bot_topic)
labelTopics(cc_bot_topic, n = 10)

calculate_topic_prevalence(cc_bot_topic)

# Regress the proportions of all 10 topics on the covariates that were used as
# prevalence predictors in the climate-change topic model.
# The model object passed here is `cc_bot_topic`, the stm fit created above;
# the previous `bot_cc` is not created anywhere in this part of the script and
# the BLM counterpart passes its own stm fit in the same position.
cc_bot_topic_effect <- estimateEffect(
  1:10 ~ round + age + income + ideology + bot_pre +
    raceminor + eduminor + langminor + ccminor,
  stmobj = cc_bot_topic,
  metadata = out_cc$meta,
  uncertainty = "Global"
)

# Keep the summary object; its per-topic coefficient tables are reused below.
cc_bot_topic_summary <- summary(cc_bot_topic_effect)

# Relative effect size = coefficient / topic prevalence.
# These were written as `a/b = c`, which R treats as an assignment to a
# non-assignable target and stops with an error; they are now plain expressions
# that print the ratio, with the reported value kept in the comment.
#Topic 9 coefficient on eduminor: 0.011. Topic prevalence: 0.096
round(0.011 / 0.096, 3) # 0.115 -> 11.5% more likely
#Topic 10 coefficient on eduminor: -0.007, on ccminor: 0.005. Topic prevalence: 0.101
round(0.007 / 0.101, 3) # 0.069 -> 6.9% less likely
round(0.005 / 0.101, 3) # 0.050 -> 5.0% more likely

# Build the climate-change coefficient table: one row per (topic, covariate),
# formatted as "estimate(se)stars", restricted to the minority-status rows and
# written to CSV.
coef_table_list <- lapply(cc_bot_topic_summary$tables, as.data.frame)
coef_table <- do.call(rbind, coef_table_list)
# `model` is the topic number; assumes every topic table has the same number of
# rows as the first one.
coef_table$model <- rep(
  seq_along(coef_table_list),
  each = nrow(coef_table_list[[1]])
)

# Significance stars, computed for all rows at once:
# p < .001 "***", p < .01 "**", p < .05 "*", otherwise "".
coef_table$stars <- as.character(cut(
  coef_table[["Pr(>|t|)"]],
  breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
  labels = c("***", "**", "*", ""),
  right = FALSE
))
coef_table$stars[is.na(coef_table$stars)] <- "" # missing p-value: no stars

# Round to three decimals and paste estimate, standard error and stars.
std_error <- round(as.numeric(coef_table[["Std. Error"]]), 3)
coef_table$Estimate <- paste0(
  round(as.numeric(coef_table[["Estimate"]]), 3),
  "(", std_error, ")",
  coef_table$stars
)

# The columns are named "Std. Error" and "Pr(>|t|)"; `$Std.Error` does not
# match the former, so both are dropped with `[[`.
coef_table[["Std. Error"]] <- NULL
coef_table[["Pr(>|t|)"]] <- NULL

# Keep only rows whose name contains "minor". This step was missing here, so
# `coef_table_minor` was read before it had been created for this topic.
coef_table_minor <- subset(coef_table, grepl("minor", rownames(coef_table)))
coef_table_minor <- coef_table_minor %>% select(c(Estimate, model))

write.csv(coef_table_minor, "coef_table_minor_cc.csv")

###### > topic modeling - BLM ######
#Requires around 30 minutes depending on your RAM
# Drop rows with a missing BLM minority indicator or missing bot_pre.
blm_bot <- drop_na(blm_bot, blmminor, bot_pre)

# Coerce the conversation round to integer and drop rows where it is missing.
blm_bot[["round"]] <- as.integer(blm_bot[["round"]])
blm_bot <- drop_na(blm_bot, round)

# Text preprocessing and document preparation for stm.
processed_blm <- textProcessor(blm_bot$text, metadata = blm_bot)
out_blm <- prepDocuments(
  processed_blm$documents,
  processed_blm$vocab,
  processed_blm$meta
)

# 10-topic structural topic model; topic prevalence depends on round,
# demographics and the minority-status indicators.
blm_bot_topic <- stm(
  out_blm$documents, out_blm$vocab,
  K = 10,
  prevalence = ~ round + age + income + ideology + bot_pre +
    raceminor + eduminor + langminor + blmminor,
  data = out_blm$meta,
  init.type = "Spectral",
  max.em.its = 500,
  seed = 1000
)

summary(blm_bot_topic)
labelTopics(blm_bot_topic, n = 50)
plot(
  blm_bot_topic,
  type = "summary",
  xlim = c(0, 0.75),
  labeltype = "frex",
  n = 10,
  main = "Topic models of chatbot responses on BLM"
)

calculate_topic_prevalence(blm_bot_topic)

# Regress the proportions of all 10 topics on the prevalence covariates.
blm_bot_topic_effect <- estimateEffect(
  1:10 ~ round + age + income + ideology + bot_pre +
    raceminor + eduminor + langminor + blmminor,
  blm_bot_topic,
  meta = out_blm$meta,
  uncertainty = "Global"
)

summary(blm_bot_topic_effect)

# NOTE(review): estimateEffect objects do not appear to expose `beta` or
# `totals` elements, in which case this product is empty - confirm what was
# intended here.
topic_prevalence <- blm_bot_topic_effect$beta * blm_bot_topic_effect$totals

# Build the BLM coefficient table: one row per (topic, covariate), formatted as
# "estimate(se)stars", restricted to the minority-status rows and written to
# CSV.
blm_bot_topic_summary <- summary(blm_bot_topic_effect)
coef_table_list <- lapply(blm_bot_topic_summary$tables, as.data.frame)
coef_table <- do.call(rbind, coef_table_list)
# `model` is the topic number; assumes every topic table has the same number of
# rows as the first one.
coef_table$model <- rep(
  seq_along(coef_table_list),
  each = nrow(coef_table_list[[1]])
)

# Significance stars, computed for all rows at once:
# p < .001 "***", p < .01 "**", p < .05 "*", otherwise "".
coef_table$stars <- as.character(cut(
  coef_table[["Pr(>|t|)"]],
  breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
  labels = c("***", "**", "*", ""),
  right = FALSE
))
coef_table$stars[is.na(coef_table$stars)] <- "" # missing p-value: no stars

# Round to three decimals and paste estimate, standard error and stars.
std_error <- round(as.numeric(coef_table[["Std. Error"]]), 3)
coef_table$Estimate <- paste0(
  round(as.numeric(coef_table[["Estimate"]]), 3),
  "(", std_error, ")",
  coef_table$stars
)

# The columns are named "Std. Error" and "Pr(>|t|)"; `$Std.Error` does not
# match the former, so both are dropped with `[[`.
coef_table[["Std. Error"]] <- NULL
coef_table[["Pr(>|t|)"]] <- NULL

# Keep only rows whose name contains "minor".
coef_table_minor <- subset(coef_table, grepl("minor", rownames(coef_table)))
coef_table_minor <- coef_table_minor %>% select(c(Estimate, model))

write.csv(coef_table_minor, "coef_table_minor_blm.csv")

###### > [Figure 4 A-1 & B-1]plotting topic prevalence ######
# Climate change: topics 9 and 10 side by side. The right-hand panel hides its
# y-axis text.
plot1 <- topic_prev_cc(
  cc_bot_topic_effect, 9,
  xlab = "",
  ylab = "\nTopic 9: Citation of scientific research"
) +
  theme(plot.margin = unit(rep(0.25, 4), "cm"))
plot2 <- topic_prev_cc(
  cc_bot_topic_effect, 10,
  xlab = "",
  ylab = "\nTopic 10: Reference to external links"
) +
  theme(
    axis.text.y = element_blank(),
    plot.margin = unit(rep(0.25, 4), "cm")
  )

grid.arrange(plot1, plot2, ncol = 2)

# BLM: topics 5 and 7 side by side, same layout.
plot1 <- topic_prev_blm(
  blm_bot_topic_effect, 5,
  xlab = "",
  ylab = "\nTopic 5: Disagreements with few justification"
) +
  theme(plot.margin = unit(rep(0.25, 4), "cm"))
plot2 <- topic_prev_blm(
  blm_bot_topic_effect, 7,
  xlab = "",
  ylab = "\nTopic 7: Unable to understand human's question"
) +
  theme(
    axis.text.y = element_blank(),
    plot.margin = unit(rep(0.25, 4), "cm")
  )
grid.arrange(plot1, plot2, ncol = 2)

###### > [Figure 4 A-2 & B-2]chatbot LIWC differences ######
# climate change
# LIWC output for the chatbot side of the climate-change conversations.
LIWC_cc <- read.csv("LIWC-22 Results - conv_cc_bot - LIWC Analysis.csv",
                    fill = TRUE, header = TRUE, sep = ",", encoding = "UTF-8",
                    row.names = NULL,
                    stringsAsFactors = FALSE)
#LIWC_cc <- type.convert(LIWC_cc)

# Average every LIWC column from WC to OtherP within each `index`.
LIWC_cc <- LIWC_cc %>%
  group_by(index) %>%
  summarise(across(WC:OtherP, mean))
LIWC_cc$index <- as.character(LIWC_cc$index)
LIWC_cc <- inner_join(LIWC_cc, data_cc_read, by = "index")

# Log-transform (log(x + 1)) word count and the two emotion scores, then keep
# the analysis columns. Columns 2:7 must stay the six LIWC features because the
# models below pick dependent variables by position.
# The user-experience outcomes are kept as well (when the joined data provides
# them): the RQ3 section selects them from LIWC_cc, and they were previously
# dropped here. any_of() leaves the result unchanged if they are absent.
LIWC_cc <- LIWC_cc %>%
  mutate(
    WC_log = log(WC + 1),
    emo_pos_log = log(emo_pos + 1),
    emo_neg_log = log(emo_neg + 1)
  ) %>%
  select(
    index, WC_log, emo_pos_log, emo_neg_log,
    Analytic, Clout, Authentic,
    gender, age, income, bot_pre, raceminor,
    langminor, eduminor, ideology, ccminor,
    any_of(c("change", "rate", "satisfaction",
             "cc_learn", "cc_continue", "cc_recommend"))
  )

# Use "neutral" as the reference level for ideology.
LIWC_cc$ideology <-
  relevel(factor(LIWC_cc$ideology), ref = "neutral")

# One OLS model per LIWC feature, with the dependent variable picked by column
# position (columns 2 to 7 of LIWC_cc).
out_LIWC_cc <- lapply(
  2:7,
  function(x) {
    lm(
      as.numeric(unlist(LIWC_cc[,x])) ~ gender + age + income + bot_pre +
        raceminor + langminor + eduminor + ideology + ccminor,
      LIWC_cc
    )
  }
)

stargazer(out_LIWC_cc,
          align = TRUE,
          out = "LIWC_cc_chatbot.html")

# The same six regressions written out with named dependent variables.
lm1 <- lm(
  WC_log ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + ccminor,
  LIWC_cc
)
lm2 <- lm(
  emo_pos_log ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + ccminor,
  LIWC_cc
)
lm3 <- lm(
  emo_neg_log ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + ccminor,
  LIWC_cc
)
lm4 <- lm(
  Analytic ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + ccminor,
  LIWC_cc
)
lm5 <- lm(
  Clout ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + ccminor,
  LIWC_cc
)
lm6 <- lm(
  Authentic ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + ccminor,
  LIWC_cc
)

# Written to the same file as the table above, so this is the version on disk.
stargazer(lm1, lm2, lm3, lm4, lm5, lm6,
          align = TRUE,
          out = "LIWC_cc_chatbot.html")

# Coefficient plots for word count and positive emotion, side by side; the
# right-hand panel hides its y-axis text.
results1 <- extract_results(lm1)
results2 <- extract_results(lm2)
plot1 <- plot_results_cc(
  results1,
  xlab = "",
  ylab = "Coefficient (Number of words)"
) +
  theme(plot.margin = unit(rep(0.5, 4), "cm"))
plot2 <- plot_results_cc(
  results2,
  xlab = "",
  ylab = "Coefficient (Positive emotion)"
) +
  theme(
    axis.text.y = element_blank(),
    plot.margin = unit(rep(0.5, 4), "cm")
  )

grid.arrange(plot1, plot2, ncol = 2)

#BLM
# LIWC output for the chatbot side of the BLM conversations.
LIWC_blm <- read.csv("LIWC-22 Results - conv_blm_bot - LIWC Analysis.csv",
                     fill = TRUE, header = TRUE, sep = ",", encoding = "UTF-8",
                     row.names = NULL,
                     stringsAsFactors = FALSE)
#LIWC_blm <- type.convert(LIWC_blm)

# Average every LIWC column from WC to OtherP within each `index`.
LIWC_blm <- LIWC_blm %>%
  group_by(index) %>%
  summarise(across(WC:OtherP, mean))
LIWC_blm$index <- as.character(LIWC_blm$index)
LIWC_blm <- inner_join(LIWC_blm, data_blm, by = "index")

# Log-transform (log(x + 1)) word count and the two emotion scores, then keep
# the analysis columns. Columns 2:7 must stay the six LIWC features because the
# models below pick dependent variables by position.
# The user-experience outcomes are kept as well (when the joined data provides
# them): the RQ3 section selects them from LIWC_blm, and they were previously
# dropped here. any_of() leaves the result unchanged if they are absent.
LIWC_blm <- LIWC_blm %>%
  mutate(
    WC_log = log(WC + 1),
    emo_pos_log = log(emo_pos + 1),
    emo_neg_log = log(emo_neg + 1)
  ) %>%
  select(
    index, WC_log, emo_pos_log, emo_neg_log,
    Analytic, Clout, Authentic,
    gender, age, income, bot_pre, raceminor,
    langminor, eduminor, ideology, blmminor,
    any_of(c("change", "rate", "satisfaction",
             "blm_learn", "blm_continue", "blm_recommend"))
  )


# Use "neutral" as the reference level for ideology.
LIWC_blm$ideology <-
  relevel(factor(LIWC_blm$ideology), ref = "neutral")

# One OLS model per LIWC feature, with the dependent variable picked by column
# position (columns 2 to 7 of LIWC_blm).
out_LIWC_blm <- lapply(
  2:7,
  function(x) {
    lm(
      as.numeric(unlist(LIWC_blm[,x])) ~ gender + age + income + bot_pre +
        raceminor + langminor + eduminor + ideology + blmminor,
      LIWC_blm
    )
  }
)

stargazer(out_LIWC_blm,
          align = TRUE,
          out = "LIWC_blm_chatbot.html")

# Refits the climate-change models by position (same call as in the
# climate-change part of this section).
out_LIWC_cc <- lapply(
  2:7,
  function(x) {
    lm(
      as.numeric(unlist(LIWC_cc[,x])) ~ gender + age + income + bot_pre +
        raceminor + langminor + eduminor + ideology + ccminor,
      LIWC_cc
    )
  }
)

# Rescaled copies of the three summary scores (divided by 50).
LIWC_blm$Analytic_1 <- LIWC_blm$Analytic / 50
LIWC_blm$Clout_1 <- LIWC_blm$Clout / 50
LIWC_blm$Authentic_1 <- LIWC_blm$Authentic / 50


# The same six regressions written out with named dependent variables.
lm1 <- lm(
  WC_log ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + blmminor,
  LIWC_blm
)
lm2 <- lm(
  emo_pos_log ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + blmminor,
  LIWC_blm
)
lm3 <- lm(
  emo_neg_log ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + blmminor,
  LIWC_blm
)
lm4 <- lm(
  Analytic ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + blmminor,
  LIWC_blm
)
lm5 <- lm(
  Clout ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + blmminor,
  LIWC_blm
)
lm6 <- lm(
  Authentic ~ gender + age + income + bot_pre + raceminor +
    langminor + eduminor + ideology + blmminor,
  LIWC_blm
)

# Written to the same file as the table above, so this is the version on disk.
stargazer(lm1, lm2, lm3, lm4, lm5, lm6,
          align = TRUE,
          out = "LIWC_blm_chatbot.html")

# Extract coefficients for word count (lm1), negative emotion (lm3) and
# Authentic (lm6); only the first two are plotted below.
results1 <- extract_results(lm1)
results2 <- extract_results(lm3)
results3 <- extract_results(lm6)

plot1 <- plot_results_blm(
  results1,
  xlab = "",
  ylab = "Coefficient (Number of words)"
) +
  theme(plot.margin = unit(rep(0.5, 4), "cm"))
plot2 <- plot_results_blm(
  results2,
  xlab = "",
  ylab = "Coefficient (Negative emotion)"
) +
  theme(
    axis.text.y = element_blank(),
    plot.margin = unit(rep(0.5, 4), "cm")
  )

grid.arrange(plot1, plot2, ncol = 2)


##### [RQ3] User Experience Gap with Bot Language Features #####
###### > [Figure 5 A]standardized for visualization - climate change #######
# Predictors: every numeric column is passed through scale(); non-numeric
# columns are left as they are.
LIWC_cc_std <- LIWC_cc %>%
  select(c(
    raceminor, langminor, eduminor, ccminor, ideology,
    WC_log, emo_pos_log, emo_neg_log, Analytic, Clout, Authentic,
    gender, age, income, bot_pre
  )) %>%
  mutate_if(is.numeric, scale)
# Strip any "[,1]" suffix from the column names.
col_names <- gsub("\\[,1\\]", "", colnames(LIWC_cc_std))
colnames(LIWC_cc_std) <- col_names

# Outcomes are taken unscaled from LIWC_cc and bound next to the predictors.
# NOTE(review): this needs LIWC_cc to still carry the outcome columns at this
# point - confirm.
LIWC_cc_rst <- LIWC_cc %>%
  select(c(change, rate, satisfaction, cc_learn, cc_continue, cc_recommend))
LIWC_cc_std <- cbind(LIWC_cc_std, LIWC_cc_rst)

# One OLS model per outcome, all with the same predictors.
lm_change_cc <- lm(
  change ~ raceminor + langminor + eduminor + ccminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_cc_std
)
lm_rate_cc <- lm(
  rate ~ raceminor + langminor + eduminor + ccminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_cc_std
)
lm_satisfaction_cc <- lm(
  satisfaction ~ raceminor + langminor + eduminor + ccminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_cc_std
)
lm_learn_cc <- lm(
  cc_learn ~ raceminor + langminor + eduminor + ccminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_cc_std
)
lm_continue_cc <- lm(
  cc_continue ~ raceminor + langminor + eduminor + ccminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_cc_std
)
lm_recommend_cc <- lm(
  cc_recommend ~ raceminor + langminor + eduminor + ccminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_cc_std
)

# LIWC-feature rows of the attitude-change model (the repeated "emo_pos_log"
# entry makes no difference to %in%).
results <- extract_results(lm_change_cc)
keep_row <- rownames(results) %in% c("WC_log","emo_pos_log","emo_pos_log","emo_neg_log",
                                     "Analytic","Clout","Authentic")
results <- results[keep_row, ]


# For each outcome model keep the six LIWC-feature rows and tag them with the
# outcome label shown in the figure.
liwc_terms <- c("WC_log","emo_pos_log","emo_neg_log","Analytic","Clout","Authentic")

results1 <- extract_results(lm_change_cc)[liwc_terms, ]
results1$DV <- "Change in Attitude"

results2 <- extract_results(lm_rate_cc)[liwc_terms, ]
results2$DV <- "Rating of GPT-3"

results3 <- extract_results(lm_satisfaction_cc)[liwc_terms, ]
results3$DV <- "Satisfaction of GPT-3"

results4 <- extract_results(lm_learn_cc)[liwc_terms, ]
results4$DV <- "Learning Experience with the GPT-3"

results5 <- extract_results(lm_continue_cc)[liwc_terms, ]
results5$DV <- "Intention to Continue Chat"

results6 <- extract_results(lm_recommend_cc)[liwc_terms, ]
results6$DV <- "Intention to Recommend to Others"

# One long table, colored by whether the confidence interval excludes zero.
combined_df <- rbind(
  results1, results2, results3,
  results4, results5, results6
)

combined_df <- add_color(combined_df)


# One panel per LIWC feature.
p1 <- plot_subset_noy_liwc(combined_df, "WC_log") +
  ggtitle("Word count")
p2 <- plot_subset_noy_liwc(combined_df, "emo_pos_log") +
  ggtitle("Positive emotion")
p3 <- plot_subset_noy_liwc(combined_df, "emo_neg_log") +
  ggtitle("Negative emotion")
p4 <- plot_subset_noy_liwc(combined_df, "Analytic") +
  ggtitle("Analytic")
p5 <- plot_subset_noy_liwc(combined_df, "Clout") +
  ggtitle("Clout")
p6 <- plot_subset_noy_liwc(combined_df, "Authentic") +
  ggtitle("Authentic")

# Panel A: p0 plus the six feature panels in one row of seven columns.
p_rq3_cc <- grid.arrange(
  p0, p1, p2, p3, p4, p5, p6,
  ncol = 7,
  top = textGrob(
    "Panel A: Climate Change",
    x = 0, hjust = 0,
    gp = gpar(fontsize = 18)
  )
)

###### > [Figure 5 B]standardized for visualization - BLM #######
# Predictors: every numeric column is passed through scale(); non-numeric
# columns are left as they are.
LIWC_blm_std <- LIWC_blm %>%
  select(c(
    raceminor, langminor, eduminor, blmminor, ideology,
    WC_log, emo_pos_log, emo_neg_log, Analytic, Clout, Authentic,
    gender, age, income, bot_pre
  )) %>%
  mutate_if(is.numeric, scale)
# Strip any "[,1]" suffix from the column names.
col_names <- gsub("\\[,1\\]", "", colnames(LIWC_blm_std))
colnames(LIWC_blm_std) <- col_names

# Outcomes are taken unscaled from LIWC_blm and bound next to the predictors.
# NOTE(review): this needs LIWC_blm to still carry the outcome columns at this
# point - confirm.
LIWC_blm_rst <- LIWC_blm %>%
  select(c(change, rate, satisfaction, blm_learn, blm_continue, blm_recommend))
LIWC_blm_std <- cbind(LIWC_blm_std, LIWC_blm_rst)

# One OLS model per outcome, all with the same predictors.
lm_change_blm <- lm(
  change ~ raceminor + langminor + eduminor + blmminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_blm_std
)
lm_rate_blm <- lm(
  rate ~ raceminor + langminor + eduminor + blmminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_blm_std
)
lm_satisfaction_blm <- lm(
  satisfaction ~ raceminor + langminor + eduminor + blmminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_blm_std
)
lm_learn_blm <- lm(
  blm_learn ~ raceminor + langminor + eduminor + blmminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_blm_std
)
lm_continue_blm <- lm(
  blm_continue ~ raceminor + langminor + eduminor + blmminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_blm_std
)
lm_recommend_blm <- lm(
  blm_recommend ~ raceminor + langminor + eduminor + blmminor + ideology +
    WC_log + emo_pos_log + emo_neg_log + Analytic + Clout + Authentic +
    gender + age + income + bot_pre,
  LIWC_blm_std
)

# For each BLM outcome model keep the six LIWC-feature rows and tag them with
# the outcome label shown in the figure.
liwc_terms <- c("WC_log", "emo_pos_log", "emo_neg_log",
                "Analytic", "Clout", "Authentic")

results1 <- extract_results(lm_change_blm)[liwc_terms, ]
results1$DV <- "Change in Attitude"

results2 <- extract_results(lm_rate_blm)[liwc_terms, ]
results2$DV <- "Rating of GPT-3"

results3 <- extract_results(lm_satisfaction_blm)[liwc_terms, ]
results3$DV <- "Satisfaction of GPT-3"

results4 <- extract_results(lm_learn_blm)[liwc_terms, ]
results4$DV <- "Learning Experience with the GPT-3"

results5 <- extract_results(lm_continue_blm)[liwc_terms, ]
results5$DV <- "Intention to Continue Chat"

results6 <- extract_results(lm_recommend_blm)[liwc_terms, ]
results6$DV <- "Intention to Recommend to Others"

# Rebuild combined_df from the BLM results and color it by whether the
# confidence interval excludes zero. Without this step the panels below were
# drawn from the combined_df left over from the climate-change panel, so
# Panel B repeated the climate-change coefficients.
combined_df <- rbind(
  results1, results2, results3,
  results4, results5, results6
)
combined_df <- add_color(combined_df)

# One panel per LIWC feature.
p1 <- plot_subset_noy_liwc(combined_df, "WC_log") + ggtitle("Word count")
p2 <- plot_subset_noy_liwc(combined_df, "emo_pos_log") + ggtitle("Positive emotion")
p3 <- plot_subset_noy_liwc(combined_df, "emo_neg_log") + ggtitle("Negative emotion")
p4 <- plot_subset_noy_liwc(combined_df, "Analytic") + ggtitle("Analytic")
p5 <- plot_subset_noy_liwc(combined_df, "Clout") + ggtitle("Clout")
p6 <- plot_subset_noy_liwc(combined_df, "Authentic") + ggtitle("Authentic")

# Panel B: p0 plus the six feature panels in one row of seven columns.
p_rq3_blm <- grid.arrange(p0, p1, p2, p3, p4, p5, p6, ncol = 7,
                          top = textGrob("Panel B: Black Lives Matter", x = 0, hjust = 0,
                                         gp = gpar(fontsize = 18)))

# Stack Panel A above Panel B.
grid.arrange(p_rq3_cc, p_rq3_blm, ncol = 1)


##### [R&R] Relationship between human prompt and chatbot response #####
library(lme4)
###### > climate change #######
# LIWC output for the human and the chatbot side of the conversations.
human_LIWC_cc <- read.csv(
  "LIWC-22 Results - conv_cc_human - LIWC Analysis.csv",
  fill = TRUE, header = TRUE, sep = ",", encoding = "UTF-8",
  row.names = NULL,
  stringsAsFactors = FALSE
)
chatbot_LIWC_cc <- read.csv(
  "LIWC-22 Results - conv_cc_bot - LIWC Analysis.csv",
  fill = TRUE, header = TRUE, sep = ",", encoding = "UTF-8",
  row.names = NULL,
  stringsAsFactors = FALSE
)


# Pair rows on `index` and `round`. After the merge, columns ending in .x come
# from the human file and columns ending in .y from the chatbot file.
combined_LIWC_cc <- merge(
  x = human_LIWC_cc,
  y = chatbot_LIWC_cc,
  by = c("index", "round")
)


# For each LIWC feature: chatbot value regressed on the human value and the
# round, with a random intercept per `index`.
model_WC <- lmer(
  WC.y ~ WC.x + round + (1 | index),
  data = combined_LIWC_cc
)
summary(model_WC)

model_emo_pos <- lmer(
  emo_pos.y ~ emo_pos.x + round + (1 | index),
  data = combined_LIWC_cc
)
summary(model_emo_pos)

model_emo_neg <- lmer(
  emo_neg.y ~ emo_neg.x + round + (1 | index),
  data = combined_LIWC_cc
)
summary(model_emo_neg)

model_Analytic <- lmer(
  Analytic.y ~ Analytic.x + round + (1 | index),
  data = combined_LIWC_cc
)
summary(model_Analytic)

model_Clout <- lmer(
  Clout.y ~ Clout.x + round + (1 | index),
  data = combined_LIWC_cc
)
summary(model_Clout)

model_Authentic <- lmer(
  Authentic.y ~ Authentic.x + round + (1 | index),
  data = combined_LIWC_cc
)
summary(model_Authentic)

# Collect the six fits and write them to one HTML table.
modellist <- c(
  model_WC, model_emo_pos, model_emo_neg,
  model_Analytic, model_Clout, model_Authentic
)

stargazer(modellist,
          type = "html",
          model.names = TRUE,
          out = "climate change models.html")

###### > BLM #######
# LIWC output for the human and the chatbot side of the conversations.
human_LIWC_blm <- read.csv(
  "LIWC-22 Results - conv_blm_human - LIWC Analysis.csv",
  fill = TRUE, header = TRUE, sep = ",", encoding = "UTF-8",
  row.names = NULL,
  stringsAsFactors = FALSE
)
chatbot_LIWC_blm <- read.csv(
  "LIWC-22 Results - conv_blm_bot - LIWC Analysis.csv",
  fill = TRUE, header = TRUE, sep = ",", encoding = "UTF-8",
  row.names = NULL,
  stringsAsFactors = FALSE
)


# Pair rows on `index` and `round`. After the merge, columns ending in .x come
# from the human file and columns ending in .y from the chatbot file.
combined_LIWC_blm <- merge(
  x = human_LIWC_blm,
  y = chatbot_LIWC_blm,
  by = c("index", "round")
)


# For each LIWC feature: chatbot value regressed on the human value and the
# round, with a random intercept per `index`.
model_WC <- lmer(
  WC.y ~ WC.x + round + (1 | index),
  data = combined_LIWC_blm
)
summary(model_WC)

model_emo_pos <- lmer(
  emo_pos.y ~ emo_pos.x + round + (1 | index),
  data = combined_LIWC_blm
)
summary(model_emo_pos)

model_emo_neg <- lmer(
  emo_neg.y ~ emo_neg.x + round + (1 | index),
  data = combined_LIWC_blm
)
summary(model_emo_neg)

model_Analytic <- lmer(
  Analytic.y ~ Analytic.x + round + (1 | index),
  data = combined_LIWC_blm
)
summary(model_Analytic)

model_Clout <- lmer(
  Clout.y ~ Clout.x + round + (1 | index),
  data = combined_LIWC_blm
)
summary(model_Clout)

model_Authentic <- lmer(
  Authentic.y ~ Authentic.x + round + (1 | index),
  data = combined_LIWC_blm
)
summary(model_Authentic)

# Collect the six fits and write them to one HTML table.
modellist <- c(
  model_WC, model_emo_pos, model_emo_neg,
  model_Analytic, model_Clout, model_Authentic
)

stargazer(modellist,
          type = "html",
          model.names = TRUE,
          out = "BLM models.html")


